Skip to main content

Documentation Index

Fetch the complete documentation index at: https://mintlify.com/mlfoundations/open_clip/llms.txt

Use this file to discover all available pages before exploring further.

OpenCLIP provides flexible image preprocessing with automatic configuration based on model requirements and customizable augmentation strategies.

Quick Start

Preprocessing is automatically configured when loading models:
import open_clip
from PIL import Image

model, preprocess_train, preprocess_val = open_clip.create_model_and_transforms(
    'ViT-B-32',
    pretrained='laion2b_s34b_b79k'
)

# Use for training
image_train = preprocess_train(Image.open('train.jpg'))

# Use for inference/validation
image_val = preprocess_val(Image.open('val.jpg'))

Preprocessing Configuration

PreprocessCfg

The PreprocessCfg dataclass defines all preprocessing parameters:
from open_clip import PreprocessCfg

config = PreprocessCfg(
    size=224,                          # Image size
    mode='RGB',                        # Color mode
    mean=(0.48145466, 0.4578275, 0.40821073),  # Normalization mean
    std=(0.26862954, 0.26130258, 0.27577711),  # Normalization std
    interpolation='bicubic',           # Resize interpolation
    resize_mode='shortest',            # Resize strategy
    fill_color=0                       # Padding fill color
)
size
int | Tuple[int, int]
default:"224"
Target image size. Can be int for square images or (height, width) tuple.
mean
Tuple[float, float, float]
RGB mean values for normalization. Defaults to OpenAI CLIP values: (0.48145466, 0.4578275, 0.40821073)
std
Tuple[float, float, float]
RGB standard deviation for normalization. Defaults to: (0.26862954, 0.26130258, 0.27577711)
interpolation
str
default:"bicubic"
Resize interpolation method: ‘bicubic’, ‘bilinear’, or ‘nearest’
resize_mode
str
default:"shortest"
Resize strategy:
  • 'shortest': Resize shortest edge, then center crop
  • 'longest': Resize longest edge, then center crop/pad
  • 'squash': Direct resize to target size (may distort)

Creating Transforms

image_transform_v2()

Create preprocessing transforms from configuration:
from open_clip import image_transform_v2, PreprocessCfg, AugmentationCfg

# Create config
preprocess_cfg = PreprocessCfg(
    size=224,
    mean=(0.485, 0.456, 0.406),
    std=(0.229, 0.224, 0.225),
    interpolation='bicubic',
    resize_mode='shortest'
)

# Training transform with augmentation
train_transform = image_transform_v2(
    preprocess_cfg,
    is_train=True,
    aug_cfg={'scale': (0.9, 1.0), 'color_jitter': (0.4, 0.4, 0.4, 0.1)}
)

# Validation transform (no augmentation)
val_transform = image_transform_v2(
    preprocess_cfg,
    is_train=False
)

Resize Modes

Shortest Edge (Default)

Resize shortest edge to target, then center crop:
from open_clip import image_transform_v2, PreprocessCfg

config = PreprocessCfg(size=224, resize_mode='shortest')
transform = image_transform_v2(config, is_train=False)

# Example: 800x600 image -> resize to 299x224 (shortest edge = 224) -> center crop to 224x224
This is the default for most CLIP models and preserves aspect ratio before cropping.

Longest Edge

Resize longest edge, pad to square:
config = PreprocessCfg(size=224, resize_mode='longest', fill_color=0)
transform = image_transform_v2(config, is_train=False)

# Example: 800x600 image -> resize to 224x168 -> pad to 224x224
Useful when preserving all image content is important.

Squash Mode

Direct resize (may distort aspect ratio):
config = PreprocessCfg(size=224, resize_mode='squash')
transform = image_transform_v2(config, is_train=False)

# Example: 800x600 image -> resize to 224x224 (distorted)
Used by SigLIP models and some other architectures.

Augmentation Configuration

AugmentationCfg

Configure training data augmentation:
from open_clip import AugmentationCfg

aug_cfg = AugmentationCfg(
    scale=(0.9, 1.0),                    # Random crop scale range
    ratio=(0.75, 1.33),                  # Random crop aspect ratio
    color_jitter=(0.4, 0.4, 0.4, 0.1),  # (brightness, contrast, saturation, hue)
    color_jitter_prob=0.8,               # Probability of applying color jitter
    gray_scale_prob=0.2,                 # Probability of grayscale conversion
    use_timm=False                       # Use timm augmentation library
)
scale
Tuple[float, float]
default:"(0.9, 1.0)"
Scale range for RandomResizedCrop. Values are fractions of the original image area.
ratio
Tuple[float, float]
Aspect ratio range for RandomResizedCrop
color_jitter
Tuple[float, ...]
Color jitter parameters: (brightness, contrast, saturation, hue)
color_jitter_prob
float
Probability of applying color jitter (0.0 to 1.0)
gray_scale_prob
float
Probability of converting to grayscale (0.0 to 1.0)
use_timm
bool
default:"False"
Use timm library’s augmentation (RandAugment, etc.)

Example Augmentation Configs

# Light augmentation
light_aug = AugmentationCfg(
    scale=(0.95, 1.0),
    color_jitter=(0.2, 0.2, 0.2, 0.05),
    color_jitter_prob=0.5
)

# Standard augmentation (default)
standard_aug = AugmentationCfg(
    scale=(0.9, 1.0),
    color_jitter=(0.4, 0.4, 0.4, 0.1),
    color_jitter_prob=0.8,
    gray_scale_prob=0.2
)

# Strong augmentation
strong_aug = AugmentationCfg(
    scale=(0.8, 1.0),
    color_jitter=(0.6, 0.6, 0.6, 0.2),
    color_jitter_prob=1.0,
    gray_scale_prob=0.3,
    use_timm=True,
    re_prob=0.25  # Random erasing
)

Normalization

Standard Normalization Values

Different model families use different normalization:
from open_clip.constants import (
    OPENAI_DATASET_MEAN,  # (0.48145466, 0.4578275, 0.40821073)
    OPENAI_DATASET_STD,   # (0.26862954, 0.26130258, 0.27577711)
    IMAGENET_MEAN,        # (0.485, 0.456, 0.406)
    IMAGENET_STD,         # (0.229, 0.224, 0.225)
    INCEPTION_MEAN,       # (0.5, 0.5, 0.5)
    INCEPTION_STD         # (0.5, 0.5, 0.5)
)

# OpenAI CLIP models (default)
config = PreprocessCfg(mean=OPENAI_DATASET_MEAN, std=OPENAI_DATASET_STD)

# SigLIP models
config = PreprocessCfg(mean=INCEPTION_MEAN, std=INCEPTION_STD)

# CLIPA models
config = PreprocessCfg(mean=IMAGENET_MEAN, std=IMAGENET_STD)
Using incorrect normalization values will significantly degrade model performance. Always use the values the model was trained with.

Custom Preprocessing

Override Model Defaults

import open_clip

model, preprocess_train, preprocess_val = open_clip.create_model_and_transforms(
    'ViT-B-32',
    pretrained='laion2b_s34b_b79k',
    # Override defaults
    image_mean=(0.5, 0.5, 0.5),
    image_std=(0.5, 0.5, 0.5),
    image_interpolation='bilinear',
    image_resize_mode='squash'
)

Manual Transform Pipeline

from torchvision import transforms
from open_clip.constants import OPENAI_DATASET_MEAN, OPENAI_DATASET_STD

# Custom preprocessing pipeline
custom_transform = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(
        mean=OPENAI_DATASET_MEAN,
        std=OPENAI_DATASET_STD
    )
])

# Use with model
from PIL import Image
image = Image.open('example.jpg')
image_tensor = custom_transform(image).unsqueeze(0)

Advanced Features

Non-Square Images

Some models support non-square inputs:
# Create model with custom image size
model, _, preprocess = open_clip.create_model_and_transforms(
    'ViT-L-14',
    pretrained='datacomp_xl_s13b_b90k',
    force_image_size=(384, 256)  # (height, width)
)

Multiple Resolutions

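Use different resolutions at inference. The transform below only changes the preprocessing; the model itself must also accept the new resolution (for example, by creating it with a matching force_image_size):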
from open_clip import create_model_from_pretrained, image_transform_v2, PreprocessCfg

# Load base model
model, base_preprocess = create_model_from_pretrained(
    'ViT-L-14',
    pretrained='datacomp_xl_s13b_b90k'
)

# Create higher resolution transform
high_res_cfg = PreprocessCfg(
    size=336,  # Instead of default 224
    mean=(0.48145466, 0.4578275, 0.40821073),
    std=(0.26862954, 0.26130258, 0.27577711),
    interpolation='bicubic'
)
high_res_preprocess = image_transform_v2(high_res_cfg, is_train=False)

Batch Preprocessing

import torch
from PIL import Image

images = [Image.open(f'img{i}.jpg') for i in range(10)]

# Preprocess batch (`preprocess` is a transform returned by create_model_and_transforms)
batch = torch.stack([preprocess(img) for img in images])
print(batch.shape)  # [10, 3, 224, 224]

Complete Example

import torch
import open_clip
from open_clip import PreprocessCfg, AugmentationCfg, image_transform_v2
from PIL import Image

# Configure preprocessing
preprocess_cfg = PreprocessCfg(
    size=224,
    mean=(0.48145466, 0.4578275, 0.40821073),
    std=(0.26862954, 0.26130258, 0.27577711),
    interpolation='bicubic',
    resize_mode='shortest'
)

# Configure augmentation for training
aug_cfg = AugmentationCfg(
    scale=(0.9, 1.0),
    color_jitter=(0.4, 0.4, 0.4, 0.1),
    color_jitter_prob=0.8,
    gray_scale_prob=0.2
)

# Create transforms
train_transform = image_transform_v2(preprocess_cfg, is_train=True, aug_cfg=aug_cfg)
val_transform = image_transform_v2(preprocess_cfg, is_train=False)

# Load model
model, _, _ = open_clip.create_model_and_transforms(
    'ViT-B-32',
    pretrained='laion2b_s34b_b79k'
)
model.eval()

# Process images
train_img = train_transform(Image.open('train.jpg'))
val_img = val_transform(Image.open('val.jpg'))

# Inference
with torch.no_grad():
    features = model.encode_image(val_img.unsqueeze(0))
    print("Features shape:", features.shape)